一. 序
這篇主要介紹如何用Python實作TF-IDF,並用它來表示文本。
二. 載入套件與文本
import jieba
import math
# Point jieba at the Traditional Chinese dictionary file.
jieba.set_dictionary('dict.txt.big')

# Article summary from iThome, source: https://www.ithome.com.tw/news/146142
text_a = '從GPT-3衍生改良而來的Codex模型,能夠將使用者的自然語言指令轉換為程式碼,OpenAI現在以私人測試的方式釋出CodexAPI'
# Article summary from iThome, source: https://www.ithome.com.tw/news/145743
text_b = 'Blender2.0除了能即時搜尋網路資訊,臉書也為其打造新的神經模組,可根據之前使用者與它的聊天脈絡來累積記憶'

# Segment each text into a list of tokens.
texta_seg = jieba.lcut(text_a)
textb_seg = jieba.lcut(text_b)

# Every distinct token that appears in either document.
unique_words = set(texta_seg) | set(textb_seg)

# One counter dict per document, keyed by the shared vocabulary so both
# dicts have the same keys in the same order.
num_words_a = {word: 0 for word in unique_words}
num_words_b = {word: 0 for word in unique_words}

# Tally how many times each token occurs in each document.
for word in texta_seg:
    num_words_a[word] += 1
for word in textb_seg:
    num_words_b[word] += 1
三. 實作TF與IDF的function
def get_TF_value(w_dict, text_seg_len):
    """Return the term frequency of every word in one document.

    Args:
        w_dict: Mapping of word -> number of occurrences in the document.
        text_seg_len: Total number of tokens in the document.

    Returns:
        A new dict mapping each word in ``w_dict`` to
        ``count / text_seg_len`` as a float. When ``text_seg_len`` is 0
        (an empty document) every word maps to 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    # An empty document has no meaningful ratio; report zero frequency
    # for every vocabulary word rather than dividing by zero.
    if not text_seg_len:
        return dict.fromkeys(w_dict, 0.0)

    # Convert once so the division is a float division on every iteration.
    total = float(text_seg_len)
    return {w: count / total for w, count in w_dict.items()}
def get_IDF_value(text_list, all_words):
    """Return the inverse document frequency of every vocabulary word.

    Args:
        text_list: List of per-document dicts mapping word -> count.
        all_words: The vocabulary. Only its keys are used, so a dict or
            any iterable of words works.

    Returns:
        A dict mapping each vocabulary word to
        ``log(len(text_list) / df)``, where ``df`` is the number of
        documents whose count for that word is greater than 0. A word
        that appears in no document (``df == 0``) gets 0.0 instead of
        raising ``ZeroDivisionError``.

    Raises:
        KeyError: If a document contains a word with a positive count
            that is not part of ``all_words``.
    """
    # Document frequency: in how many documents does each word occur?
    doc_freq = dict.fromkeys(all_words, 0)
    for text in text_list:
        for w, val in text.items():
            # A positive count means the word occurs in this document.
            if val > 0:
                doc_freq[w] += 1

    n_docs = float(len(text_list))
    idf_dict = {}
    for w, df in doc_freq.items():
        # IDF formula; guard df == 0 so unseen words do not divide by zero.
        idf_dict[w] = math.log(n_docs / df) if df else 0.0
    return idf_dict
三. 計算tfidf
# Term frequencies of each document over the shared vocabulary.
tf_a = get_TF_value(num_words_a, len(texta_seg))
tf_b = get_TF_value(num_words_b, len(textb_seg))

# Inverse document frequencies computed across both documents.
idf = get_IDF_value([num_words_a, num_words_b], num_words_a)

# TF-IDF is the per-word product of term frequency and IDF.
tfidf_a = {w: val * idf[w] for w, val in tf_a.items()}
tfidf_b = {w: val * idf[w] for w, val in tf_b.items()}
{'能': 0.0,
'來': 0.0,
'即時': 0.0,
'而來': 0.023104906018664842,
'可': 0.0,
',': 0.0,
'之前': 0.0,
'模組': 0.0,
'指令': 0.023104906018664842,
'的': 0.0,
'測試': 0.023104906018664842,
'也': 0.0,
'使用者': 0.0,
'3': 0.023104906018664842,
...}
四. 用TFIDF表示成句子/文本
# Vector representation of text a: the TF-IDF values of tfidf_a,
# taken in the dict's iteration order.
bow_a = list(tfidf_a.values())
print(bow_a)
[0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842]
tfidf_a的output如上(即前面計算tfidf之後列出的字典)
對比一下,bow_a中的值與tfidf_a的值會是一樣的,這也就是'從GPT-3衍生改良而來的Codex模型,能夠將使用者的自然語言指令轉換為程式碼,OpenAI現在以私人測試的方式釋出CodexAPI' 這個句子用TFIDF表示的方式了~~